4. Main Analysis
# Rating distribution per district (postcode), dodged side by side.
# Proportions (count / total) are plotted instead of raw counts so districts
# with different sample sizes remain comparable.
# FIX: the original put `food$Rating` inside aes() and declared x twice
# (once per aes() call) — use bare, data-masked column names instead.
ggplot(food, aes(x = Rating, fill = factor(zip_code))) +
  geom_histogram(aes(y = (..count..) / sum(..count..)),
                 breaks = seq(0, 5, by = 0.5),
                 position = 'dodge')
#+ geom_density(aes(y = ..density..), alpha = .3)
We can see that the restaurant ratings in district 10036 peak around 3.5, and the restaurant ratings in district 10019 peak around 3.0.
## For price, there is not much difference between 10019 and 10036; both
## districts are dominated by price-level-2 restaurants.
# Earlier draft referenced a `Postcode` column (superseded by `zip_code`):
#ggplot(data = food, aes(food$Price, fill=factor(food$Postcode))) + geom_histogram(aes(x= food$Price), binwidth = 1,position = 'dodge')
# FIX: use bare column names inside aes() rather than `food$col`.
ggplot(food, aes(x = Price, fill = factor(zip_code))) +
  geom_histogram(aes(y = ..density..), binwidth = 1, position = 'dodge')
# TODO: the category columns still need cleaning before they can be used here.
# TODO: overlay a density curve on this plot.
For price, there is not much difference between 10019 and 10036: both districts mostly have price-level-2 restaurants. However, no restaurant in 10019 has a price level of 1.
We can also see that districts 10011, 10012, and 10033 each contain only one restaurant, so their density is always 1.
## Review-count distribution per district (postcode), dodged; x capped at 800.
# FIX: use bare column names inside aes() rather than `food$col`.
ggplot(food, aes(x = Review_Count, fill = factor(zip_code))) +
  geom_histogram(position = 'dodge', binwidth = 50) +
  xlim(0, 800)
## Boxplots of review counts per district, flipped for readable labels.
## Restaurants in 10036 appear more likely to have many reviews.
# FIX: use bare column names inside aes() rather than `food$col`.
ggplot(food, aes(x = factor(zip_code), y = Review_Count)) +
  geom_boxplot() +
  coord_flip()
##
For review counts, we find that restaurants in 10036 are more likely to have many reviews, while the review counts for restaurants in 10018 are more sparse.
## Rating vs. review count, first as a hexbin, then as a jittered scatter.
ggplot(food, aes(x = Rating, y = Review_Count)) +
  stat_bin_hex()
ggplot(food, aes(x = Rating, y = Review_Count)) +
  geom_point(position = 'jitter', alpha = .3)
From the plot we can see that: 1. there is a cluster around ratings 3–4 with fewer than 500 reviews; 2. restaurants with more reviews are more likely to have high ratings; 3. no restaurant with more than 600 reviews has a low rating; 4. the restaurants with over 1,000 reviews are outliers, and they are all rated 3.5 or 4.0.
# Price vs. review count, first as a hexbin, then as a jittered scatter.
ggplot(food, aes(x = Price, y = Review_Count)) +
  stat_bin_hex()
ggplot(food, aes(x = Price, y = Review_Count)) +
  geom_point(position = 'jitter', alpha = .3)
1. There is a cluster around price levels 1–2 with fewer than 500 reviews. 2. The outliers are the points with over 1,000 reviews and ratings above 3.
# Restrict to the four busiest districts and four broad cuisine groups.
data_new <- subset(food, zip_code %in% c(10036, 10019, 10018, 10020))
data_new <- subset(data_new,
                   Category_2nd_Level %in% c('North American', 'Deli', 'Europe', 'Asian'))
# Count restaurants per (category, district) pair.
# FIX: the original grouped on `data_new$...` AFTER drop_na(), which refers to
# the pre-filter vectors and can misalign with the piped rows; group on the
# piped columns instead (this also makes the manual rename straightforward).
counts3 <- data_new %>%
  drop_na(zip_code, Category_2nd_Level) %>%
  group_by(Category_2nd_Level, zip_code) %>%
  summarize(Freq = n()) %>%
  ungroup()
colnames(counts3) <- c('Category_data', 'zip_code', 'Freq')
vcd::mosaic(factor(Category_data) ~ zip_code, direction = c('v', 'h'),
            counts3, rot_labels = c(0, 90, 0, 0))
From the plot we can see that the majority of the data lies in 10019 and 10036, and that district influences restaurant category: there are more North American restaurants in 10036, and relatively fewer restaurants of other types.
Interactive Parallel Coordinates
library(GGally)
library(ggplot2)
library(tidyverse)
# Columns used for the parallel-coordinate views (selected by position in `food`).
parallel_data <- food[c(4:9, 11)]
# Treat the categorical columns as factors.
for (cat_col in c("Category_data", "Category_2nd_Level", "Price")) {
  parallel_data[[cat_col]] <- factor(parallel_data[[cat_col]])
}
#parallel_data$zip_code <- factor(parallel_data$zip_code)
# Static ggparcoord() variants previously tried with different scaling schemes:
#ggparcoord(parallel_data ,alphaLines = .7, groupColumn = 'Category_2nd_Level', scale = "uniminmax")+ylab('Data')+xlab('Indicator')
#ggparcoord(parallel_data ,alphaLines = .7, groupColumn = 'Category_2nd_Level', scale = "globalminmax")+ylab('Data')+xlab('Indicator')
#ggparcoord(parallel_data ,alphaLines = .7, groupColumn = 'Category_2nd_Level', scale = "robust")+ylab('Data')+xlab('Indicator')
#ggparcoord(parallel_data ,alphaLines = .7, groupColumn = 'Category_2nd_Level', scale = "std")+ylab('Data')+xlab('Indicator')
# See: http://www.buildingwidgets.com/blog/2015/1/30/week-04-interactive-parallel-coordinates-1
# See: http://www.buildingwidgets.com/blog/2015/1/30/week-04-interactive-parallel-coordinates-1
# FIX: install the widget only when missing, rather than unconditionally
# reinstalling from GitHub on every run of the script.
if (!requireNamespace("parcoords", quietly = TRUE)) {
  devtools::install_github("timelyportfolio/parcoords")
}
library(parcoords)
# Drop extreme review counts so the axis stays readable.
# (Equivalent to the original `Review_Count %in% c(1:1000)` for whole-number
# counts, and also keeps any non-integer values inside the range.)
parallel_data <- subset(parallel_data,
                        Review_Count >= 1 & Review_Count <= 1000)
#parallel_data$zip_code <- factor(parallel_data$zip_code)
# FIX: spell out TRUE/FALSE instead of the reassignable T/F shorthands.
parcoords(parallel_data,
          rownames = FALSE,
          brushMode = "2D-strums",
          reorderable = TRUE,
          queue = TRUE,
          alpha = .5,
          color = list(
            colorBy = "Category_2nd_Level",
            colorScale = htmlwidgets::JS("d3.scale.category10()")
          ))
# Prepare a fully categorical copy of the data for the alluvial diagram.
food_pl <- as.data.frame(food[, c(2, 4, 5, 6, 7, 8, 9, 10, 11)])
# Round the continuous scores/ratings so they bin into discrete factor levels.
food_pl$Score <- round(food_pl$Score)
food_pl$Rating <- round(food_pl$Rating)
food_pl[, 1:9] <- lapply(food_pl[, 1:9], factor)
# Keep only rows with a known Score.
# FIX: na.omit() for data frames has no `cols` argument — the original call
# silently dropped rows with an NA in ANY column; filter on Score explicitly,
# matching the intent of the commented-out drop_na(Score) draft below.
food_plna <- food_pl[!is.na(food_pl$Score), ]
colnames(food_plna)
## [1] "ID" "Category_data" "Category_2nd_Level"
## [4] "Rating" "Review_Count" "Price"
## [7] "Street_Num" "zip_code" "Score"
# Frequency of each unique combination of the alluvial dimensions.
food_al <- food_plna %>%
  #drop_na(Score) %>%
  group_by(Category_2nd_Level, Rating, Price, Street_Num, zip_code,
           Review_Count, Category_data) %>%
  summarise(Freq = n()) %>%
  ungroup()  # drop the grouping so downstream code sees a plain frame
#tidyfood2 <- food %>%
# group_by(Category_data, Category_2nd_Level, Rating, Review_Count, Street_Num, zip_code) %>%
# summarise(n = sum(ID))
#tidyfd <- food %>% rownames_to_column("Name") %>%
# gather(key = ID, value = , -Name, -School)
library(alluvial)
# FIX: Set3 supports up to 12 colours; the original hard-coded 10, which left
# any 11th+ category with an NA colour. Qualitative brewer palettes are
# prefix-stable, so the first 10 colours are unchanged by this widening.
pal <- RColorBrewer::brewer.pal(12, "Set3")
alluvial(food_al[, c("Category_2nd_Level", "Rating", "Price", "zip_code")],
         freq = food_al$Freq,
         blocks = TRUE,
         alpha = 0.8,
         # Colour each flow by its second-level category.
         col = pal[match(food_al$Category_2nd_Level,
                         unique(food_al$Category_2nd_Level))])